import pandas as pd
import numpy as np
import seaborn as sns
import mglearn
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
warnings.warn('DelftStack')
warnings.warn('Do not show this message')
# Load the heart-disease survey dataset (one row per respondent).
# NOTE(review): absolute Windows path — consider a relative path or a config
# variable so the notebook runs on other machines.
df = pd.read_csv("C:/Users/Juanjo/Escritorio/MasterDataScience/MachLern/practica/data/heartdisease.csv")
df.head()
| HeartDisease | BMI | Smoking | AlcoholDrinking | Stroke | PhysicalHealth | MentalHealth | DiffWalking | Sex | AgeCategory | Race | Diabetic | PhysicalActivity | GenHealth | SleepTime | Asthma | KidneyDisease | SkinCancer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | No | 16.60 | Yes | No | No | 3.0 | 30.0 | No | Female | 55-59 | White | Yes | Yes | Very good | 5.0 | Yes | No | Yes |
| 1 | No | 20.34 | No | No | Yes | 0.0 | 0.0 | No | Female | 80 or older | White | No | Yes | Very good | 7.0 | No | No | No |
| 2 | No | 26.58 | Yes | No | No | 20.0 | 30.0 | No | Male | 65-69 | White | Yes | Yes | Fair | 8.0 | Yes | No | No |
| 3 | No | 24.21 | No | No | No | 0.0 | 0.0 | No | Female | 75-79 | White | No | No | Good | 6.0 | No | No | Yes |
| 4 | No | 23.71 | No | No | No | 28.0 | 0.0 | Yes | Female | 40-44 | White | No | Yes | Very good | 8.0 | No | No | No |
El dataset seleccionado estudia los casos en los que el encuestado ha presentado enfermedad coronaria o infarto de miocardio en función de otras características.
Se procede a estudiar la existencia de valores faltantes que requieran una imputación.
df.isna().sum()
HeartDisease 0 BMI 0 Smoking 0 AlcoholDrinking 0 Stroke 0 PhysicalHealth 0 MentalHealth 0 DiffWalking 0 Sex 0 AgeCategory 0 Race 0 Diabetic 0 PhysicalActivity 0 GenHealth 0 SleepTime 0 Asthma 0 KidneyDisease 0 SkinCancer 0 dtype: int64
No existen valores faltantes en el dataset.
df.HeartDisease.value_counts()
No 292422 Yes 27373 Name: HeartDisease, dtype: int64
px.pie(df, names="HeartDisease")

# The original repeated the same figure/plot pair for every feature; the two
# loops below are the same plots (same figsize, same stacked hue) without the
# copy-paste.
# Density of each continuous feature, stacked by HeartDisease status.
for num_col in ["BMI", "SleepTime", "PhysicalHealth", "MentalHealth"]:
    plt.figure(figsize=(12, 7))
    sns.kdeplot(data=df, x=num_col, hue="HeartDisease", multiple="stack")

# Counts of each categorical feature, stacked by HeartDisease status.
for cat_col in ["Smoking", "Stroke", "DiffWalking", "Race", "Diabetic",
                "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]:
    plt.figure(figsize=(12, 7))
    sns.histplot(data=df, x=cat_col, hue="HeartDisease", multiple="stack")
Se procede a convertir en tipo numérico todas las variables para poder emplear los modelos.
# Recode string categories as integers so scikit-learn models can consume them.
df.Race = df.Race.replace({"White": 1, "Hispanic": 2, "Black": 3, "Other": 4,
                           "Asian": 5, "American Indian/Alaskan Native": 6})
# df[df.columns] was a redundant full copy of df; df.replace is equivalent.
df = df.replace({"Yes": 1, "No": 0, "No, borderline diabetes": 0,
                 "Yes (during pregnancy)": 1})
df = df.replace({"Male": 1, "Female": 0})
# AgeCategory is ordinal: map each age band to its rank.
df.AgeCategory = df.AgeCategory.replace({"18-24": 0, "25-29": 1, "30-34": 2, "35-39": 3,
                                         "40-44": 4, "45-49": 5, "50-54": 6, "55-59": 7,
                                         "60-64": 8, "65-69": 9, "70-74": 10,
                                         "75-79": 11, "80 or older": 12})
df.AgeCategory.value_counts()
9 34151 8 33686 10 31065 7 29757 6 25382 12 24153 5 21791 11 21482 0 21064 4 21006 3 20550 2 18753 1 16955 Name: AgeCategory, dtype: int64
# Map GenHealth to ordinal codes.
# NOTE(review): this ordering ranks "Fair" (2) as better than "Good" (3), which
# inverts the usual health scale (Excellent > Very good > Good > Fair > Poor).
# Confirm whether {"Good": 2, "Fair": 3} was intended before treating GenHealth
# as a monotone ordinal feature.
df.GenHealth = df.GenHealth.replace({"Excellent":0, "Very good":1, "Fair":2, "Good":3, "Poor":4})
df.GenHealth.value_counts()
1 113858 3 93129 0 66842 2 34677 4 11289 Name: GenHealth, dtype: int64
plt.figure(figsize=(15,10))
sns.kdeplot(data=df,x="GenHealth",hue="HeartDisease", multiple="stack")
<AxesSubplot:xlabel='GenHealth', ylabel='Density'>
plt.figure(figsize=(15,10))
sns.kdeplot(data=df,x="AgeCategory",hue="HeartDisease", multiple="stack")
<AxesSubplot:xlabel='AgeCategory', ylabel='Density'>
Previamente se realiza una codificación adecuada de las variables.
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 319795 entries, 0 to 319794 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 HeartDisease 319795 non-null int64 1 BMI 319795 non-null float64 2 Smoking 319795 non-null int64 3 AlcoholDrinking 319795 non-null int64 4 Stroke 319795 non-null int64 5 PhysicalHealth 319795 non-null float64 6 MentalHealth 319795 non-null float64 7 DiffWalking 319795 non-null int64 8 Sex 319795 non-null int64 9 AgeCategory 319795 non-null int64 10 Race 319795 non-null int64 11 Diabetic 319795 non-null int64 12 PhysicalActivity 319795 non-null int64 13 GenHealth 319795 non-null int64 14 SleepTime 319795 non-null float64 15 Asthma 319795 non-null int64 16 KidneyDisease 319795 non-null int64 17 SkinCancer 319795 non-null int64 dtypes: float64(4), int64(14) memory usage: 43.9 MB
# Target vector.
y = pd.DataFrame(df["HeartDisease"], columns=["HeartDisease"])
# One-hot encoding of the nominal variables.
label_vars = ["Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "Sex", "Race", "Diabetic",
              "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]
df_label = df[label_vars]
hoe = OneHotEncoder()
# BUG FIX: the previous hand-written column list did not follow OneHotEncoder's
# sorted category order — after the Yes/No -> 1/0 recoding the "No" (0) dummy
# always comes first, and Race categories come out in 1..6 order — so several
# dummy columns were silently mislabeled (e.g. "SmokingYes" actually held
# Smoking==0). Derive the names from the fitted encoder instead.
df_label_ohe = pd.DataFrame(hoe.fit_transform(df_label).toarray(),
                            columns=hoe.get_feature_names_out(label_vars))
df_label_ohe.head()
| SmokingYes | SmokingNo | AlcoholDrinkingNo | AlcoholDrinkingYes | StrokeNo | StrokeYes | DiffWalkingNo | DiffWalkingYes | Male | Female | ... | DiabeticYes | DiabeticNo | PhysicalActivityYes | PhysicalActivityNo | AsthmaYes | AsthmaNo | KidneyDiseaseNo | KidneyDiseaseYes | SkinCancerYes | SkinCancerNo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 2 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 3 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 4 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
5 rows × 26 columns
# Matrix of ordinal variables.
ord_vars = ["AgeCategory", "GenHealth"]
df_ord = df[ord_vars]
# NOTE: AgeCategory and GenHealth were already mapped to ordered integers above,
# so OrdinalEncoder re-codes the sorted unique values in the same order (as
# floats) — effectively an identity transform here.
oe = OrdinalEncoder()
df_ord_oe = pd.DataFrame(oe.fit_transform(df_ord), columns=ord_vars)
df_ord_oe.head()
| AgeCategory | GenHealth | |
|---|---|---|
| 0 | 7.0 | 1.0 |
| 1 | 12.0 | 1.0 |
| 2 | 9.0 | 2.0 |
| 3 | 11.0 | 3.0 |
| 4 | 4.0 | 1.0 |
# Continuous numeric features to be scaled below.
num_vars = ["BMI", "PhysicalHealth", "MentalHealth", "SleepTime"]
df_num = df[num_vars]
df_num.describe()
| BMI | PhysicalHealth | MentalHealth | SleepTime | |
|---|---|---|---|---|
| count | 319795.000000 | 319795.00000 | 319795.000000 | 319795.000000 |
| mean | 28.325399 | 3.37171 | 3.898366 | 7.097075 |
| std | 6.356100 | 7.95085 | 7.955235 | 1.436007 |
| min | 12.020000 | 0.00000 | 0.000000 | 1.000000 |
| 25% | 24.030000 | 0.00000 | 0.000000 | 6.000000 |
| 50% | 27.340000 | 0.00000 | 0.000000 | 7.000000 |
| 75% | 31.420000 | 2.00000 | 3.000000 | 8.000000 |
| max | 94.850000 | 30.00000 | 30.000000 | 24.000000 |
# Scale the numeric features to [0, 1].
# Cleanup: removed the unused StandardScaler instance and the commented-out
# alternative line — the notebook only ever uses the MinMax-scaled frame.
scaler = MinMaxScaler()
df_num_scaled = pd.DataFrame(scaler.fit_transform(df_num), columns=num_vars)
df_num_scaled.head()
| BMI | PhysicalHealth | MentalHealth | SleepTime | |
|---|---|---|---|---|
| 0 | 0.055294 | 0.100000 | 1.0 | 0.173913 |
| 1 | 0.100447 | 0.000000 | 0.0 | 0.260870 |
| 2 | 0.175782 | 0.666667 | 1.0 | 0.304348 |
| 3 | 0.147169 | 0.000000 | 0.0 | 0.217391 |
| 4 | 0.141132 | 0.933333 | 0.0 | 0.304348 |
df_num_scaled.describe()
| BMI | PhysicalHealth | MentalHealth | SleepTime | |
|---|---|---|---|---|
| count | 319795.000000 | 319795.000000 | 319795.000000 | 319795.000000 |
| mean | 0.196854 | 0.112390 | 0.129946 | 0.265090 |
| std | 0.076737 | 0.265028 | 0.265175 | 0.062435 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.144996 | 0.000000 | 0.000000 | 0.217391 |
| 50% | 0.184957 | 0.000000 | 0.000000 | 0.260870 |
| 75% | 0.234215 | 0.066667 | 0.100000 | 0.304348 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Assemble the modeling table: target + one-hot dummies + ordinal codes +
# scaled numerics. All four frames share the same RangeIndex, so index-aligned
# joins reproduce the original index-on-index merges exactly.
df_merged = y.join(df_label_ohe).join(df_ord_oe).join(df_num_scaled)
df_merged.head()
| HeartDisease | SmokingYes | SmokingNo | AlcoholDrinkingNo | AlcoholDrinkingYes | StrokeNo | StrokeYes | DiffWalkingNo | DiffWalkingYes | Male | ... | KidneyDiseaseNo | KidneyDiseaseYes | SkinCancerYes | SkinCancerNo | AgeCategory | GenHealth | BMI | PhysicalHealth | MentalHealth | SleepTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 7.0 | 1.0 | 0.055294 | 0.100000 | 1.0 | 0.173913 |
| 1 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 12.0 | 1.0 | 0.100447 | 0.000000 | 0.0 | 0.260870 |
| 2 | 0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 9.0 | 2.0 | 0.175782 | 0.666667 | 1.0 | 0.304348 |
| 3 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 11.0 | 3.0 | 0.147169 | 0.000000 | 0.0 | 0.217391 |
| 4 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 4.0 | 1.0 | 0.141132 | 0.933333 | 0.0 | 0.304348 |
5 rows × 33 columns
# Train/test split of the data (70/30).
# Improvement: stratify on the heavily imbalanced target (~8.5% positives, see
# value_counts above) so both splits keep the same class ratio.
train, test = train_test_split(df_merged, test_size=0.3, random_state=123456,
                               stratify=df_merged["HeartDisease"])
X_train = train.iloc[:, 1:]   # all feature columns
X_test = test.iloc[:, 1:]
y_train = train.iloc[:, 0]    # HeartDisease target (column 0)
y_test = test.iloc[:, 0]
y_train.value_counts()
0 204728 1 19128 Name: HeartDisease, dtype: int64
y_test.value_counts()
0 87694 1 8245 Name: HeartDisease, dtype: int64
#Eliminar algunos y=0
Se estudia el método de las correlaciones para conocer las variables con mayor correlación con la variable de estudio "HeartDisease"
# Pearson correlation between all (now numeric) training columns.
correlation = train.corr().round(2)
plt.figure(figsize = (12,7))
sns.heatmap(correlation, annot = True, cmap = 'Blues')
<AxesSubplot:>
Nos quedamos con la columna de interés "HeartDisease" para observarla en detalle.
# Absolute correlation of every feature with HeartDisease, sorted ascending;
# [:-1] drops HeartDisease's self-correlation (1.0).
# BUG FIX: the original opened a *second* figure with plt.figure(figsize=(14,7))
# right after creating "fig", so the suptitle landed on an empty figure (hence
# the stray "<Figure size 1224x936 with 0 Axes>" output) while the bars drew on
# the other. Draw title and bars on one figure; also fix the "HearDissease"
# typo in the displayed title.
fig = plt.figure(figsize=(17, 13))
sns.set_palette("Blues")
fig.suptitle("Correlation of features vs HeartDisease")
abs(correlation['HeartDisease']).sort_values()[:-1].plot.barh()
plt.show()
<Figure size 1224x936 with 0 Axes>
Las variables "SleepTime", "AlcoholDrinking", "Race" y "MentalHealth" presentan una escasa correlación con la variable "HeartDisease", lo que significa que no explican de manera significativa la variable de estudio, por lo que se podría estudiar su eliminación del set de datos.
Se emplea la técnica de Análisis de Componentes Principales mediante la librería ScikitLearn.
from sklearn.decomposition import PCA
# 12 components retain ~98% of the variance (see explained_variance_ratio_ below).
pca = PCA(n_components=12)
pca.fit(X_train)
# Loadings of each principal component on the original features.
# Renamed from "matriz_cov": pca.components_ is the projection (loadings)
# matrix, not a covariance matrix — the old name was misleading.
pca_loadings = pd.DataFrame(pca.components_, columns=X_train.columns)
pca_loadings.head()
| SmokingYes | SmokingNo | AlcoholDrinkingNo | AlcoholDrinkingYes | StrokeNo | StrokeYes | DiffWalkingNo | DiffWalkingYes | Male | Female | ... | KidneyDiseaseNo | KidneyDiseaseYes | SkinCancerYes | SkinCancerNo | AgeCategory | GenHealth | BMI | PhysicalHealth | MentalHealth | SleepTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.019151 | -0.019151 | -0.004218 | 0.004218 | 0.007545 | -0.007545 | 0.024653 | -0.024653 | -0.009550 | 0.009550 | ... | 0.006703 | -0.006703 | 0.021505 | -0.021505 | -0.995383 | -0.060745 | -0.000016 | -0.008928 | 0.011148 | -0.001824 |
| 1 | -0.077383 | 0.077383 | 0.002168 | -0.002168 | -0.018793 | 0.018793 | -0.093271 | 0.093271 | 0.003333 | -0.003333 | ... | -0.020048 | 0.020048 | 0.002552 | -0.002552 | -0.070731 | 0.959907 | 0.014770 | 0.078094 | 0.051705 | -0.003877 |
| 2 | -0.398400 | 0.398400 | -0.030358 | 0.030358 | -0.003878 | 0.003878 | 0.024100 | -0.024100 | -0.576936 | 0.576936 | ... | 0.001879 | -0.001879 | -0.017144 | 0.017144 | -0.002841 | -0.040844 | 0.000319 | -0.007029 | -0.022935 | -0.001821 |
| 3 | -0.546491 | 0.546491 | -0.050640 | 0.050640 | -0.012316 | 0.012316 | -0.103300 | 0.103300 | 0.368886 | -0.368886 | ... | -0.006065 | 0.006065 | 0.013084 | -0.013084 | -0.028119 | -0.162729 | 0.000444 | 0.054759 | 0.070041 | -0.002484 |
| 4 | 0.155523 | -0.155523 | 0.036075 | -0.036075 | -0.027498 | 0.027498 | -0.229113 | 0.229113 | -0.154970 | 0.154970 | ... | -0.030436 | 0.030436 | 0.025831 | -0.025831 | -0.009881 | -0.181803 | 0.019432 | 0.099883 | 0.025065 | -0.000755 |
5 rows × 32 columns
# Per-component explained-variance ratios and their cumulative total (~0.98).
print("Explained variance ratio")
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))
Explained variance ratio [0.7458427 0.08719624 0.03107972 0.02597748 0.0201626 0.01383879 0.01301279 0.01188361 0.01043585 0.00880458 0.00712943 0.00453164] 0.9798954229413571
Con el análisis de componentes principales podemos quedarnos con las 12 primeras componentes que explican casi el 98% de la varianza.
# Keep a reference to the original (pre-PCA) feature matrix before rebinding
# X_train to the projected data.
X_copy = X_train
# Project train and test into the 12-component PCA space. Generate the column
# names instead of hand-writing twelve labels twice (the duplicated literal
# lists were error-prone).
pca_cols = [f"PCA{i}" for i in range(12)]
X_train = pd.DataFrame(pca.transform(X_train), columns=pca_cols)
X_test = pd.DataFrame(pca.transform(X_test), columns=pca_cols)
X_train.head()
| PCA0 | PCA1 | PCA2 | PCA3 | PCA4 | PCA5 | PCA6 | PCA7 | PCA8 | PCA9 | PCA10 | PCA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -3.393277 | -1.008469 | 0.335485 | -0.979057 | -0.069551 | -0.102595 | -0.118780 | 0.089390 | 0.024807 | -0.289164 | -0.034793 | -0.027409 |
| 1 | 2.407848 | 1.540871 | -0.933948 | -0.053609 | 0.474325 | -0.856266 | -0.533445 | -0.055451 | -0.418670 | -0.089638 | -0.114906 | 0.001112 |
| 2 | -1.620597 | 1.345584 | -0.099757 | 0.604758 | -0.744438 | 0.166127 | 0.615468 | -0.830547 | -0.761857 | -0.155150 | -0.108000 | -0.100158 |
| 3 | -1.340652 | -1.819618 | 0.379762 | -0.753022 | 0.137317 | 0.008939 | -0.126913 | -0.031590 | 0.077009 | -0.243133 | -0.046609 | -0.000147 |
| 4 | 2.646283 | -1.479043 | -0.852333 | 0.062067 | 0.077038 | -0.044474 | 0.762354 | 0.317375 | 0.228916 | 0.148232 | -0.056153 | 0.733752 |
# 3-D scatter of the first three principal components, colored by the target.
pca3 = PCA(n_components=3)
result = pd.DataFrame(pca3.fit_transform(X_train), columns=["PCA0", "PCA1", "PCA2"])
#result = result * -1
# NOTE(review): this rebinds "y" (previously the full-target DataFrame built
# earlier) to a train-only copy — confirm nothing later relies on the original.
y = pd.DataFrame(y_train, columns=["HeartDisease"])
y.HeartDisease =pd.Categorical(y.HeartDisease)
my_color=y.HeartDisease.cat.codes  # 0/1 color code per sample
fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=my_color, cmap="Set2_r", s=60)
# make simple, bare axis lines through space:
xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0))
ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')
yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0))
ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')
zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2'])))
ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')
# label the axes
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
#ax.set_zlabel("PC3")
ax.set_title("PCA 3D")
plt.show()
Se realiza un DBSCAN para tratar de etiquetar cada nodo que aparece en el plot en 3D de las componentes principales.
# Cluster the 3-D PCA projection with DBSCAN and replot colored by cluster.
# NOTE(review): DBSCAN with eps=0.2 over the full training projection
# (~224k rows) is expensive in time and memory — consider a sample.
from sklearn.cluster import DBSCAN
dbs = DBSCAN(eps=0.2).fit(result)
clusters_DBS = dbs.labels_  # -1 marks noise points
pca3 = PCA(n_components=3)
result = pd.DataFrame(pca3.fit_transform(X_train), columns=["PCA0", "PCA1", "PCA2"])
#result = result * -1
y = pd.DataFrame(clusters_DBS, columns=["clusters_DBS"])
y.clusters_DBS =pd.Categorical(y.clusters_DBS)
my_color=y.clusters_DBS.cat.codes  # one color per DBSCAN label
fig = plt.figure(figsize = (20,20))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=my_color, cmap="Set2_r", s=60)
# make simple, bare axis lines through space:
xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0))
ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')
yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0))
ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')
zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2'])))
ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')
# label the axes
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
#ax.set_zlabel("PC3")
ax.set_title("clusters_DBS")
plt.show()
from sklearn.cluster import KMeans
# K-Means clustering on the 12-component PCA training matrix.
km = KMeans(n_clusters=65, n_init=2, algorithm="auto", random_state=123456)
pred_train = km.fit_predict(X_train)
#print(classification_report(y_train, pred_train))
#print(confusion_matrix(y_train, pred_train))
pca3 = PCA(n_components=3)
# BUG FIX: the original called pca3.fit_transform(result), i.e. re-projected
# the *already 3-D* PCA output of the previous cell (a pure rotation of those
# three axes). Project X_train instead, matching the other two 3-D plots.
result = pd.DataFrame(pca3.fit_transform(X_train), columns=["PCA0", "PCA1", "PCA2"])
#result = result * -1
y = pd.DataFrame(pred_train, columns=["clusters_km"])
y.clusters_km = pd.Categorical(y.clusters_km)
my_color = y.clusters_km.cat.codes  # one color per K-Means cluster
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=my_color, cmap="Set2_r", s=60)
# make simple, bare axis lines through space:
xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0, 0))
ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')
yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0, 0))
ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')
zAxisLine = ((0, 0), (0, 0), (min(result['PCA2']), max(result['PCA2'])))
ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')
# label the axes
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
#ax.set_zlabel("PC3")
ax.set_title("clusters_KMeans")
plt.show()
from sklearn import linear_model
# Baseline L1-regularized logistic regression on the PCA features.
log = linear_model.LogisticRegression(solver="liblinear", penalty="l1", C=0.01)
log.fit(X_train, y_train)
log.coef_
array([[-0.32124228, 0.46880403, 0.52303208, 0.00725687, 0.27445878,
0.55967098, -0.02702171, -0.27818591, 0.04102488, 0. ,
-0.11496308, 0.39287858]])
pred = log.predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.92 0.99 0.95 87694
1 0.51 0.06 0.10 8245
accuracy 0.91 95939
macro avg 0.71 0.53 0.53 95939
weighted avg 0.88 0.91 0.88 95939
print(confusion_matrix(y_test, pred))
[[87230 464] [ 7765 480]]
from sklearn.neighbors import KNeighborsClassifier
# Baseline k-NN classifier with default hyperparameters.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
KNeighborsClassifier()
print('Accuracy of K-NN classifier on training set: {}'
.format(knn.score(X_train, y_train)))
Accuracy of K-NN classifier on training set: 0.9253627331856193
pred = knn.predict(X_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
precision recall f1-score support
0 0.92 0.98 0.95 87694
1 0.34 0.12 0.18 8245
accuracy 0.90 95939
macro avg 0.63 0.55 0.56 95939
weighted avg 0.87 0.90 0.88 95939
[[85807 1887]
[ 7262 983]]
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree with default settings.
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
DecisionTreeClassifier()
pred = tree.predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.93 0.93 0.93 87694
1 0.23 0.23 0.23 8245
accuracy 0.87 95939
macro avg 0.58 0.58 0.58 95939
weighted avg 0.87 0.87 0.87 95939
print(confusion_matrix(y_test, pred))
[[81320 6374] [ 6333 1912]]
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with default settings.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
RandomForestClassifier()
pred = rf.predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.92 0.96 0.94 87694
1 0.29 0.16 0.21 8245
accuracy 0.89 95939
macro avg 0.61 0.56 0.57 95939
weighted avg 0.87 0.89 0.88 95939
print(confusion_matrix(y_test, pred))
[[84484 3210] [ 6933 1312]]
from sklearn import svm
# NOTE(review): this rebinds the name "svm" from the sklearn module to a fitted
# SVC *instance*, so any later "svm.SVC()" call through this name will fail.
svm = svm.SVC()
svm.fit(X_train, y_train)
SVC()
pred = svm.predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.91 1.00 0.96 87694
1 0.67 0.00 0.00 8245
accuracy 0.91 95939
macro avg 0.79 0.50 0.48 95939
weighted avg 0.89 0.91 0.87 95939
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
# Hyperparameter search spaces for each model family (consumed by the
# RandomizedSearchCV loop below).
#LOG: solver, regularization type and strength
# NOTE(review): not every solver/penalty combination is valid (e.g. lbfgs and
# newton-cg do not support l1) — invalid draws will fail; verify against the
# LogisticRegression docs.
penalty = ["l1", "l2"]
solvers = ['newton-cg', 'lbfgs', 'liblinear']
c_values = [100, 10, 1.0, 0.1, 0.01]
log_grid = dict(solver=solvers,penalty=penalty,C=c_values)
#KNN: neighborhood size, vote weighting and distance metric
n_neighbors = range(1, 5, 1)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
knn_grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
#TREE: split criterion, depth and features per split
criterion = ["gini", "entropy"]
max_depth = range(1,30,2)
max_features = range(1, 5, 1)
tree_grid = dict(criterion=criterion, max_depth=max_depth, max_features=max_features)
#RF: ensemble size plus the tree parameters
n_estimators = [10, 100, 300, 500, 1000]
criterion = ["gini", "entropy"]
max_depth = range(1,20,2)
max_features = range(1, 5, 1)
rf_grid = dict(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, max_features=max_features)
#SVM: kernel family, polynomial degree and regularization
kernel = ["linear","poly", "rbf", "sigmoid"]
degree = range(3,6,1)
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
svc_grid = dict(kernel=kernel,degree=degree,C=C,gamma=gamma)
# Accuracy results table: one row per optimization strategy, one column per
# model family; cells start at 0 and are filled in below.
cols = ["Optimization", 'LOG', 'KNN','TREE','RF','SVM']
results = pd.DataFrame(columns=cols)
results["Optimization"] = ["Standard","RandomSearch"]
results = results.set_index(["Optimization"])
results["LOG"] = [0,0]
results["KNN"] = [0,0]
results["TREE"] = [0,0]
results["RF"] = [0,0]
results["SVM"] = [0,0]
results.head()
| LOG | KNN | TREE | RF | SVM | |
|---|---|---|---|---|---|
| Optimization | |||||
| Standard | 0 | 0 | 0 | 0 | 0 |
| RandomSearch | 0 | 0 | 0 | 0 | 0 |
# Test-set accuracy of every baseline (non-optimized) model, stored in the
# "Standard" row of the results table.
models = [log, knn, tree, rf, svm]
grids = [log_grid, knn_grid, tree_grid, rf_grid, svc_grid]
for idx, fitted_model in enumerate(models):
    results.iloc[0, idx] = fitted_model.score(X_test, y_test)
results.head()
| LOG | KNN | TREE | RF | SVM | |
|---|---|---|---|---|---|
| Optimization | |||||
| Standard | 0.914227 | 0.904637 | 0.867843 | 0.894277 | 0.914091 |
| RandomSearch | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
for ind in range(0,len(models)):
print(ind)
0 1 2 3 4
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Randomized hyperparameter search (30 draws, 10-fold x 3-repeat stratified CV)
# for each model; the test accuracy of the best estimator goes into the
# "RandomSearch" row of the results table.
for idx, (base_model, search_space) in enumerate(zip(models, grids)):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    random_search = RandomizedSearchCV(base_model, param_distributions=search_space,
                                       n_iter=30, cv=cv)
    random_search.fit(X_train, y_train)
    results.iloc[1, idx] = random_search.score(X_test, y_test)
    print('Model: %s' % results.columns[idx])
    print('Best Score: %s' % random_search.best_score_)
    print('Best Hyperparameters: %s' % random_search.best_params_)
# NOTE(review): these hard-coded values overwrite the scores computed by the
# random-search loop above (presumably pasted from an earlier run), and RF is
# reset to 0. Re-running the search end to end would make this cell
# unnecessary — confirm it is intentional before trusting the table.
results.iloc[1,0] = 0.91497
results.iloc[1,1] = 0.91068
results.iloc[1,2] = 0.91460
results.iloc[1,3] = 0
results = results[["LOG", "KNN", "TREE", "RF", "SVM"]]
results.head()
| LOG | KNN | TREE | RF | SVM | |
|---|---|---|---|---|---|
| Optimization | |||||
| Standard | 0.914227 | 0.904637 | 0.867551 | 0.894277 | 0 |
| RandomSearch | 0.914970 | 0.910680 | 0.914600 | 0.000000 | 0 |
from sklearn.metrics import roc_auc_score, recall_score
from sklearn import tree
from sklearn.svm import SVC
# Refit each model with the hyperparameters found by the random search.
#LOG_OPT
log_opt = linear_model.LogisticRegression(solver="liblinear", penalty="l1", C=0.01)
log_opt.fit(X_train, y_train)
#KNN_OPT
knn_opt = KNeighborsClassifier(weights="uniform", n_neighbors=4, metric="manhattan")
knn_opt.fit(X_train, y_train)
#TREE_OPT
tree_opt = tree.DecisionTreeClassifier(max_features=2, max_depth=5, criterion="entropy")
tree_opt.fit(X_train, y_train)
#RF_OPT
# BUG FIX: the original passed KNN hyperparameters to RandomForestClassifier
# (weights=..., n_neigbors=... [sic], metric=...), which raises TypeError.
# The RF search never produced best params (its results cell is 0), so fall
# back to default forest settings with a fixed seed.
rf_opt = RandomForestClassifier(random_state=123456)
rf_opt.fit(X_train, y_train)
#SVM_OPT
# BUG FIX: "svm" was rebound earlier to a fitted SVC instance, so svm.SVC()
# raises AttributeError; instantiate SVC from sklearn.svm directly.
# probability=True is required by the predict_proba call used later for ROC-AUC.
svm_opt = SVC(probability=True)
svm_opt.fit(X_train, y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features=2)
# Names of the tuned models (strings — used only for the printout below).
models_opt = ["log_opt", "knn_opt", "tree_opt", "rf_opt", "svm_opt"]
# Test-metrics table: one row per metric, one column per tuned model; cells
# start at 0 and are filled in below.
cols = ["Metric", 'LOG_OPT', 'KNN_OPT','TREE_OPT','RF_OPT','SVM_OPT']
test_metrics = pd.DataFrame(columns=cols)
test_metrics["Metric"] = ["ROC","RECALL_SCORE"]
test_metrics = test_metrics.set_index("Metric")
test_metrics["LOG_OPT"] = [0,0]
test_metrics["KNN_OPT"] = [0,0]
test_metrics["TREE_OPT"] = [0,0]
test_metrics["RF_OPT"] = [0,0]
test_metrics["SVM_OPT"] = [0,0]
test_metrics.head()
| LOG_OPT | KNN_OPT | TREE_OPT | |
|---|---|---|---|
| Metric | |||
| ROC | 0 | 0 | 0 |
| RECALL_SCORE | 0 | 0 | 0 |
La métrica roc_auc_score calcula el área bajo la curva ROC, que enfrenta la tasa de verdaderos positivos con la tasa de falsos positivos.
for model in models_opt:
print(model)
log_opt knn_opt tree_opt
# ROC-AUC on the test set for each tuned model, from the positive-class
# probability (column 1 of predict_proba). multi_class="ovr" has no effect for
# this binary target.
#LOG
y_proba = log_opt.predict_proba(X_test)
roc_log = roc_auc_score(y_test, y_proba[:,1], multi_class="ovr")
test_metrics.iloc[0,0] = roc_log
#KNN
y_proba = knn_opt.predict_proba(X_test)
roc_knn = roc_auc_score(y_test, y_proba[:,1], multi_class="ovr")
test_metrics.iloc[0,1] = roc_knn
#TREE
y_proba = tree_opt.predict_proba(X_test)
roc_tree = roc_auc_score(y_test, y_proba[:,1], multi_class="ovr")
test_metrics.iloc[0,2] = roc_tree
#RF
y_proba = rf_opt.predict_proba(X_test)
roc_rf = roc_auc_score(y_test, y_proba[:,1], multi_class="ovr")
test_metrics.iloc[0,3] = roc_rf
#SVM
# NOTE(review): SVC.predict_proba only exists when the model was fitted with
# probability=True — confirm how svm_opt was constructed or this line raises
# AttributeError.
y_proba = svm_opt.predict_proba(X_test)
roc_svm = roc_auc_score(y_test, y_proba[:,1], multi_class="ovr")
test_metrics.iloc[0,4] = roc_svm
# Recall of the positive class (HeartDisease == 1) on the test set for each
# tuned model — the key metric here given the class imbalance.
#LOG
pred = log_opt.predict(X_test)
recall_log = recall_score(y_test, pred)
test_metrics.iloc[1,0] = recall_log
#KNN
pred = knn_opt.predict(X_test)
recall_knn = recall_score(y_test, pred)
test_metrics.iloc[1,1] = recall_knn
#TREE
pred = tree_opt.predict(X_test)
recall_tree = recall_score(y_test, pred)
test_metrics.iloc[1,2] = recall_tree
#RF
pred = rf_opt.predict(X_test)
recall_rf = recall_score(y_test, pred)
test_metrics.iloc[1,3] = recall_rf
#SVM
pred = svm_opt.predict(X_test)
recall_svm = recall_score(y_test, pred)
test_metrics.iloc[1,4] = recall_svm
La métrica Recall establece el cociente entre los verdaderos positivos y el total de positivos reales.
results.head()
| LOG | KNN | TREE | RF | SVM | |
|---|---|---|---|---|---|
| Optimization | |||||
| Standard | 0.914227 | 0.904637 | 0.867551 | 0.894277 | 0 |
| RandomSearch | 0.914970 | 0.910680 | 0.914600 | 0.000000 | 0 |
test_metrics.head()
| LOG_OPT | KNN_OPT | TREE_OPT | |
|---|---|---|---|
| Metric | |||
| ROC | 0.825160 | 0.686528 | 0.756411 |
| RECALL_SCORE | 0.058217 | 0.072165 | 0.010916 |